#alexander f. gazmararian
#afg2@princeton.edu
#January 9, 2024

#Purpose: Process county employment data for construction of treatment groups and controls
#   used in subesquent analyses

#Load packages
library(tidyverse)
library(tidylog)
library(data.table)
library(here)

#http://fpeckert.me/cbp/
#load data
cbp <- fread(here("data", "input", "cbp_naics", "efsy_panel_naics.csv"))

#Adjust FIPS codes for continuity for the handful of counties that change
cbp[cbp$fipstate==46&cbp$fipscty==113&cbp$year==2016,]$fipscty <- 102
cbp[cbp$fipstate==12&cbp$fipscty==25&cbp$year==2000,]$fipscty <- 86
#from wikipedia: Columbus is not an independent city as the number suggests.
#It is a consolidated city-county with Muscogee County, incorporating everything outside of Fort Benning
cbp[cbp$fipstate==13&cbp$fipscty==510,]$fipscty <- 215
#Genevieve County in Missouri changed its FIPS code from 29193 to 29186 in. 1979
#302 29193  1976
#303 29193  1980
cbp[cbp$fipstate==29&cbp$fipscty==193&cbp$year==1980,]$fipscty<-186

#subset data to relevant industry codes
cbp_sub <- subset(cbp, naics12 %in% c("2121//", "212111", "212112", "212113", "213112", "213111", "221112"))
#create categorical variable by industry type
cbp_sub <-
  cbp_sub %>%
  mutate(
    industry = case_when(
      naics12 %in% c("2121//","212111", "212112","212113") ~ "coal",
      naics12 %in% c("213112","213111") ~ "oilgas",
      naics12 == "221112" ~ "power"
    )
  )
#aggregate by county, year and industry
cbp_sub <- cbp_sub %>%
  group_by(fipstate,fipscty,industry,year) %>%
  summarise(emp = sum(emp))
#pivot wider
cbp_sub <- cbp_sub %>%
  pivot_wider(values_from=emp,names_from=industry,values_fill=0)
#create total county employment aggregate
cbp_all <- cbp %>%
  group_by(fipstate,fipscty,year) %>%
  summarise(emp_all = sum(emp))
#Merge with main data to create county share
cbp_merge <- left_join(cbp_all, cbp_sub, by = c("fipstate","fipscty","year"))
#replace NA with 0 since there is no employment in those industries
cbp_merge <- cbp_merge %>% mutate(across(coal:oilgas, ~ ifelse(is.na(.x), 0, .x)))
#Add 2019 and 2020 data, since the imputed data end in 2018
#load 2019 data
cbp19 <- fread(here("data", "input", "cbp", "cbp19co.txt"))
cbp19$year <- 2019
cbp20 <- fread(here("data", "input", "cbp", "cbp20co.txt"))
cbp20$year <- 2020
#merge new and imputed data
cbpnew <- bind_rows(cbp19,cbp20)
#aggregate to total
total <- cbpnew %>%
  filter(naics == "------") %>%
  group_by(fipstate, fipscty, year) %>%
  dplyr::summarise(emp_all = sum(emp, na.rm = TRUE))
#aggregate industries
cbpnew_sub <-
  cbpnew %>%
  mutate(naics = gsub("-", "", naics)) %>%
  filter(naics %in% c("2121//", "213112","213111", "221112")) %>%
  mutate(
    industry = case_when(
      naics %in% c("2121//") ~ "coal",
      naics %in% c("213112","213111") ~ "oilgas",
      naics %in% c("221112") ~ "power"
    )
  ) %>%
  dplyr::select(-naics) %>%
  group_by(fipstate, fipscty, year, industry) %>%
  summarise(emp = sum(emp, na.rm = TRUE))
#pivot wider
cbpnew_sub <- pivot_wider(cbpnew_sub, names_from=industry,values_from=emp,values_fill=0)
#merge together
cbpnew_merge <- left_join(total, cbpnew_sub, by = c("fipstate", "fipscty", "year"))
#convert NA to 0 since the county had none of those jobs
cbpnew_merge <- cbpnew_merge %>% mutate(across(power:coal, ~ ifelse(is.na(.x), 0, .x)))
#bind together
cbp_out <- bind_rows(cbp_merge, cbpnew_merge)
#pad county fips codes to prepare to concatenate them with the state FIPS codes
cbp_out$fipscty <- stringr::str_pad(cbp_out$fipscty, 3, "left", "0")
cbp_out$fips <- paste0(cbp_out$fipstate, cbp_out$fipscty)
#subset to the essential variables
cbp_out <- subset(cbp_out, select = -c(fipstate, fipscty))
cbp_out$fips <- as.numeric(cbp_out$fips)
#remove invalid fips codes
cbp_out <- subset(cbp_out, !grepl("999", str_sub(fips, -3, -1)))
#save output
saveRDS(cbp_out, here("data", "inter", "cbp.rds"))
